.hdr <- "result/step1/matrix_final"
.data <- fileset.list(.hdr)
.file <- "result/step2/prot_bbknn.rds"
if.needed(.file, {
.batches <- .fread(.prot.data$col, col.names="tag")
.batches[, c("barcode","batch") := tstrsplit(tag, "_")]
.bbknn <- rcpp_mmutil_bbknn_pca(.prot.data$mtx,
.batches$batch,
knn = 10, RANK = 20,
EM_ITER = 20,
TAKE_LN = TRUE)
saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)
Run annotation purely based on marker genes:
.file <- "result/step2/prot_annot.txt.gz"
if.needed(.file, {
.pos.markers <- .read.marker.file("marker/surface_round1_positive.txt")
.neg.markers <- .read.marker.file("marker/surface_round1_negative.txt")
.annot.out <-
rcpp_mmutil_annotate_columns(
pos_labels = list(p1=.pos.markers),
r_neg_labels = list(n1=.neg.markers),
row_file = .prot.data$row,
col_file = .prot.data$col,
r_U = .bbknn$U,
r_D = .bbknn$D,
r_V = .bbknn$factors.adjusted,
EM_TOL = 1e-6,
EM_ITER = 500)
annot.dt <- setDT(.annot.out$annotation)
.col <- c("tag", "celltype", "prob", "ln.prob")
names(annot.dt) <- .col
annot.dt[, c("barcode","batch") := tstrsplit(tag, split="_")]
fwrite(annot.dt, .file)
})
annot.dt <- fread(.file)
Two-dimensional density plot on the normalized CD marker expressions.
U <- .bbknn$U
D <- .bbknn$D
V <- .bbknn$factors.adjusted
prot.mtx <- pmax(exp(sweep(U, 2, D, `*`) %*% t(V)) - 1, 0)
prot.mtx <- apply(prot.mtx, 2, function(x) x/pmax(sum(x),1) * 1e4)
rownames(prot.mtx) <- readLines(.prot.data$row)
colnames(prot.mtx) <- readLines(.prot.data$col)
.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.mtx)
print(plt)
.file <- fig.dir %&% "/Fig_round1_cdmarker_bbknn.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)
Two-dimensional density plot on the raw CD marker concentrations.
prot.raw.mtx <- read.dense(.prot.data$mtx)
rownames(prot.raw.mtx) <- readLines(.prot.data$row)
colnames(prot.raw.mtx) <- readLines(.prot.data$col)
.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.raw.mtx)
print(plt)
.file <- fig.dir %&% "/Fig_round1_cdmarker.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)
.file <- "result/step2/gene_bbknn.rds"
if.needed(.file, {
.batches <- .fread(.gene.data$col, col.names="tag")
.batches[, c("barcode","batch") := tstrsplit(tag, "_")]
.bbknn <- rcpp_mmutil_bbknn_pca(.gene.data$mtx,
.batches$batch,
knn = 10, RANK = 30,
EM_ITER = 20,
TAKE_LN = TRUE,
NUM_THREADS = 8)
saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)
run.bbknn.umap <- function(.bbknn, .cells, ...){
.df <- data.frame(from = .bbknn$knn$src.index,
to = .bbknn$knn$tgt.index,
weight = .bbknn$knn$weight)
G <- igraph::graph_from_data_frame(.df, directed=FALSE)
nrepeat <- 20
C <- list(modularity = 0)
for(r in 1:nrepeat){
c.r <- igraph::cluster_louvain(G, resolution = .75)
message(paste("modularity: ", c.r$modularity, "\n"))
if(r == 1 || max(c.r$modularity) > max(C$modularity)){
C <- c.r
}
}
.clust <- data.table(tag = .cells[as.integer(C$names)],
membership = C$membership)
A <- igraph::as_adj(G)
umap.A <- uwot::optimize_graph_layout(A, verbose = TRUE, n_sgd_threads = "auto", ...)
.dt <- data.table(tag = .cells[as.integer(rownames(A))],
umap1 = umap.A[,1],
umap2 = umap.A[,2]) %>%
left_join(.clust, by = "tag")
}
Build a batch-balancing kNN graph
Louvain graph-based clustering
Construct UMAP the kNN backbone graph
.file <- "result/step2/gene_bbknn_louvain.txt.gz"
.cells <- readLines(.gene.data$col)
if.needed(.file, {
.bbknn.umap <- run.bbknn.umap(.bbknn, .cells, spread=3)
fwrite(.bbknn.umap, .file)
})
.bbknn.umap <- fread(.file) %>%
left_join(annot.dt, by = "tag") %>%
mutate(membership = as.factor(membership))
p1 <-
.gg.plot(.bbknn.umap, aes(umap1, umap2, color=membership)) +
geom_point(stroke=0, alpha=.8, size=.5)
ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300)
geom_point: na.rm = FALSE stat_identity: na.rm = FALSE position_identity
p2 <-
.gg.plot(.bbknn.umap, aes(umap1, umap2, color=celltype)) +
ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300) +
scale_color_brewer(palette = "Paired")
plt <- p1 | p2
print(plt)
.file <- fig.dir %&% "/Fig_bbknn_gene_only_umap.pdf"
.gg.save(filename = .file, plot = plt, width=8, height=3)
.ct.mem <- .bbknn.umap[, .(nc = .N), by = .(celltype, membership)]
.ct.mem[, ntot := sum(`nc`), by = .(membership)]
.ct.mem[, pr := `nc`/`ntot`]
.df <-
.ct.mem %>%
mutate(row = celltype, col = membership,
weight = logit(pmax(pmin(pr, 1-1e-2), 1e-2))) %>%
order.pair(ret.tab = TRUE)
plt <-
.gg.plot(.df, aes(col, row, fill=weight, size = `nc`)) +
xlab("Louvain clustering") +
ylab("cell type") +
theme(legend.position = "bottom") +
geom_point(pch=22) +
scale_size_continuous("#cells", range=c(0,5)) +
scale_fill_distiller("proportion",
palette="PuRd", direction = 1,
labels=function(x) round(sigmoid(x), 2))
print(plt)
.file <- fig.dir %&% "/Fig_bbknn_gene_only_cluster.pdf"
.gg.save(filename = .file, plot = plt, width=5, height=2)
.file <- "result/step2/full_bbknn.rds"
if.needed(.file, {
.batches <- .fread(.data$col, col.names="tag")
.batches[, c("barcode","batch") := tstrsplit(tag, "_")]
.bbknn <- rcpp_mmutil_bbknn_pca(.data$mtx,
.batches$batch,
knn = 10, RANK = 30,
EM_ITER = 20,
TAKE_LN = TRUE,
NUM_THREADS = 8)
saveRDS(.bbknn, .file)
})
.bbknn <- readRDS(.file)
.file <- "result/step2/prot_annot_full.txt.gz"
if.needed(.file, {
.pos.markers <- .read.marker.file("marker/surface_round1_positive.txt")
.neg.markers <- .read.marker.file("marker/surface_round1_negative.txt")
.annot.out <-
rcpp_mmutil_annotate_columns(
pos_labels = list(p1=.pos.markers),
r_neg_labels = list(n1=.neg.markers),
row_file = .data$row,
col_file = .data$col,
r_U = .bbknn$U,
r_D = .bbknn$D,
r_V = .bbknn$factors.adjusted,
EM_TOL = 1e-6,
EM_ITER = 500)
annot.dt <- setDT(.annot.out$annotation)
.col <- c("tag", "celltype", "prob", "ln.prob")
names(annot.dt) <- .col
annot.dt[, c("barcode","batch") := tstrsplit(tag, split="_")]
fwrite(annot.dt, .file)
})
annot.dt <- fread(.file)
Two-dimensional density plot on the raw CD marker concentrations.
prot.raw.mtx <- read.dense(.prot.data$mtx)
rownames(prot.raw.mtx) <- readLines(.prot.data$row)
colnames(prot.raw.mtx) <- readLines(.prot.data$col)
.ct <- c("mTreg","nTreg","mTconv","nTconv")
plt <- plt.scatter.ct.2(.ct, annot.dt, prot.raw.mtx)
print(plt)
.file <- fig.dir %&% "/Fig_round1_cdmarker_full.pdf"
.gg.save(filename = .file, plot = plt, width=6, height=5)
.file <- "result/step2/full_bbknn_louvain.txt.gz"
.cells <- readLines(.data$col)
if.needed(.file, {
.bbknn.umap <- run.bbknn.umap(.bbknn, .cells, spread=3)
fwrite(.bbknn.umap, .file)
})
.bbknn.umap <- fread(.file) %>%
left_join(annot.dt, by = "tag") %>%
mutate(membership = as.factor(membership))
p1 <-
.gg.plot(.bbknn.umap, aes(umap1, umap2, color=membership)) +
ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300)
p2 <-
.gg.plot(.bbknn.umap, aes(umap1, umap2, color=celltype)) +
ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300) +
scale_color_brewer(palette = "Paired")
plt <- p1 | p2
print(plt)
.file <- fig.dir %&% "/Fig_bbknn_full_umap.pdf"
.gg.save(filename = .file, plot = plt, width=8, height=3)
.ct.mem <- .bbknn.umap[, .(nc = .N), by = .(celltype, membership)]
.ct.mem[, ntot := sum(`nc`), by = .(membership)]
.ct.mem[, pr := `nc`/`ntot`]
.df <-
.ct.mem %>%
mutate(row = celltype, col = membership,
weight = logit(pmax(pmin(pr, 1-1e-2), 1e-2))) %>%
order.pair(ret.tab = TRUE)
plt <-
.gg.plot(.df, aes(col, row, fill=weight, size = `nc`)) +
xlab("Louvain clustering") +
ylab("cell type") +
theme(legend.position = "bottom") +
geom_point(pch=22) +
scale_size_continuous("#cells", range=c(0,5)) +
scale_fill_distiller("proportion",
palette="PuRd", direction = 1,
labels=function(x) round(sigmoid(x), 2))
print(plt)
.file <- fig.dir %&% "/Fig_bbknn_full_cluster.pdf"
.gg.save(filename = .file, plot = plt, width=5, height=2)
.valid.clust <- .ct.mem[order(pr, decreasing = T),
head(.SD, 1),
by = .(membership)]
.valid.celltype.cluster <-
.valid.clust[pr > .5, .(membership, celltype)] %>%
unique()
celltype.final <-
.bbknn.umap %>%
rename(celltype.protein = celltype) %>%
(function(x) left_join(.valid.celltype.cluster, x)) %>%
na.omit()
DOWNLOAD: Cell type estimation
p1 <-
.gg.plot(celltype.final, aes(umap1, umap2, color=membership)) +
ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300)
p2 <-
.gg.plot(celltype.final, aes(umap1, umap2, color=celltype)) +
ggrastr::rasterise(geom_point(stroke=0, alpha=.8, size=.5), dpi=300) +
scale_color_brewer(palette = "Paired")
plt <- p1 | p2
print(plt)
.file <- fig.dir %&% "/Fig_bbknn_final_umap.pdf"
.gg.save(filename = .file, plot = plt, width=8, height=3)
.markers <-
c("FOXP3", "ID3", "BACH2", "CXCR3", "PRDM1", "SGK1", "TCF7", "LEF1",
"SELL", "IL2RA", "IL7R", "IKZF2", "CCR6", "CCR4", "CCR7", "CTLA4",
"HLA-DRA", "anti_CD25", "anti_CD127", "anti_CD183", "anti_CD196",
"anti_CD197", "anti_CD194", "anti_CD45RA", "anti_CD45RO",
"anti_HLA") %>%
unique
.marker.hdr <- "result/step2/marker/marker"
.mkdir(dirname(.marker.hdr))
.marker.data <- fileset.list(.marker.hdr)
if.needed(.marker.data, {
.marker.data <-
rcpp_mmutil_copy_selected_rows(.data$mtx,
.data$row,
.data$col,
.markers,
.marker.hdr)
})
Note: these plots show expression values without batch adjustment
.markers <- sort(as.character(unique(marker.dt$marker)))
for(g in .markers){
plt <- plot.marker.umap(g)
print(plt)
.file <- fig.dir %&% "/Fig_marker_" %&% g %&% "_umap.pdf"
.gg.save(filename = .file, plot = plt, width=3.2, height=2.8)
}
.hash.info <- fread("data/cell_info_batch3-7.csv.gz") %>%
rename(tag = cbid, subject = id) %>%
select(-`V1`) %>%
mutate(disease = substr(`subject`, 1, 2)) %>%
mutate(hash = gsub(pattern="[a-zA-Z\\-]+", replacement="", `hash`))
.sample.info <-
readxl::read_xlsx("data/Hashing list MS Treg project.xlsx", 1) %>%
rename(Sample = `Cell type`) %>%
mutate(hash = gsub("#","",`hash`))
.dt <-
celltype.final %>%
mutate(batch=as.integer(batch)) %>%
left_join(.hash.info) %>%
mutate(ct=factor(celltype, c("nTconv","mTconv","nTreg","mTreg"))) %>%
left_join(.sample.info)